The data is in tab-separated text (.txt) format. Data is imported
using the fread function from the data.table package.
The sample metadata is available in the sampleTable.csv file and
is read in using the read.csv function.
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
Count matrix Stats: Features(genes) : 35741 Number of samples : 11
Sample Metadata. In this step we will filter out genes with very low counts across the samples. DESeq2 performs independent filtering, but reducing the dimension of the input dataset can speed up the process for large datasets.
# Number of genes in the assay before any filtering (used in the report below).
genesInAssay <- dim(counts)[1]

# Filtering genes with some parameters: keep a gene only if it exceeds
# `minimumCountpergene` counts in more than `MinSampleWithminimumgeneCounts`
# samples.
minimumCountpergene <- 10
MinSampleWithminimumgeneCounts <- 4
# rowSums works on the logical comparison matrix directly; the original wrapped
# it in data.frame(), which copied the whole matrix for no benefit.
counts <- counts[rowSums(counts > minimumCountpergene) > MinSampleWithminimumgeneCounts, ]

# Report the filtering parameters and outcome. The original passed
# append = TRUE, which cat() ignores when no file is given; dropped.
# Also fixed the "Samlpes" typo in the emitted message.
cat(c("\n Total genes before filtering : ", genesInAssay), sep = " ")
cat(c("\n Minimum Counts per gene : ", minimumCountpergene), sep = " ")
cat(c("\n Minimum Samples with Minimum Counts per gene : ", MinSampleWithminimumgeneCounts), sep = " ")
cat(c("\n Total genes after filtering : ", dim(counts)[1]), sep = " ")
##
## Total genes before filtering : 35741
## Minimum Counts per gene : 10
## Minimum Samlpes with Minimum Counts per gene : 4
## Total genes after filtering : 15036
In this step we will perform a few QC steps to check data completeness and will also try to identify whether any sample is behaving oddly from the raw-counts perspective.
Test that all the input samples in sampleTable have their
corresponding counts in the counts object.
# Sanity check: every sample listed in sampleTable must be present as a column
# of the count matrix (the reordering step below subsets counts by these
# names). The original only reported success and stayed silent on a mismatch;
# fail loudly instead so a problem is not discovered further downstream.
if (all(sampleTable$sample %in% colnames(counts))){
message("Good News !!! Samples in count matrix matches with that of in sampleTable")
} else {
stop("Samples in sampleTable missing from count matrix: ",
     paste(setdiff(sampleTable$sample, colnames(counts)), collapse = ", "),
     call. = FALSE)
}
## Good News !!! Samples in count matrix matches with that of in sampleTable
# Reorder the count-matrix columns to the row order of sampleTable so that
# countData and colData line up for DESeq2.
cat("\n\n Matching order of samples between counts and sampleTable \n")
counts<-counts[sampleTable$sample]
cat("\n Is the sample orders identical between counts matrix and sampletable \n")
# Auto-printed at top level; expected TRUE after the reordering above.
identical(colnames(counts),sampleTable$sample)
cat("\n\n Convert metadata in categorical data whereever applicable\n")
# condition is the design variable; DESeq2 requires it to be a factor.
sampleTable$condition<-as.factor(sampleTable$condition)
str(sampleTable)
##
##
## Matching order of samples between counts and sampleTable
##
## Is the sample orders identical between counts matrix and sampletable
## [1] TRUE
##
##
## Convert metadata in categorical data whereever applicable
## 'data.frame': 12 obs. of 5 variables:
## $ sample : chr "Control-1" "Control-2" "Control-3" "PNA-10b-1" ...
## $ condition : Factor w/ 4 levels "control","PNA.10b",..: 1 1 1 2 2 2 3 3 3 4 ...
## $ Cells : chr "U87" "U87" "U87" "U87" ...
## $ treat.info : chr "mock" "mock" "mock" "sgPNA-10b/BNP (150 nM sgPNA-10b)" ...
## $ Incubation.time: chr "72h" "72h" "72h" "72h" ...
Quality check of samples to identify outliers or odd behaving samples.
# Per-sample summary statistics (mean, sd, quantiles, inline histogram) via
# skim() — presumably skimr::skim; the table it prints follows below.
df_qc<-skim(counts)
head(df_qc)
| Name | counts |
| Number of rows | 15036 |
| Number of columns | 12 |
| _______________________ | |
| Column type frequency: | |
| numeric | 6 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Control-1 | 0 | 1 | 1017.52 | 3156.94 | 1 | 60 | 347 | 1009.00 | 197719 | ▇▁▁▁▁ |
| Control-2 | 0 | 1 | 1131.79 | 3261.55 | 0 | 70 | 395 | 1155.00 | 192146 | ▇▁▁▁▁ |
| Control-3 | 0 | 1 | 852.65 | 2749.13 | 0 | 50 | 283 | 836.00 | 180138 | ▇▁▁▁▁ |
| PNA-10b-1 | 0 | 1 | 931.03 | 2699.00 | 0 | 54 | 318 | 937.00 | 145053 | ▇▁▁▁▁ |
| PNA-10b-2 | 0 | 1 | 1126.39 | 3039.44 | 0 | 66 | 393 | 1157.25 | 149187 | ▇▁▁▁▁ |
| PNA-10b-3 | 0 | 1 | 1059.67 | 2912.07 | 0 | 65 | 376 | 1092.00 | 153101 | ▇▁▁▁▁ |
Look at the distribution of mean and std.deviation of counts across all samples. #### Sample Counts : Mean and Standard Deviation {.tabset}
In this analysis control samples are used as the reference
condition for statistical analysis.
# reassuring that order of samples in rows of sampleTable are identical with the sample order in count matrix
counts<-counts[sampleTable$sample]
# Build the DESeq2 dataset: counts modeled as a function of condition.
dds <- DESeqDataSetFromMatrix(countData = as.matrix(counts),
colData=sampleTable,
design = ~condition)
# Setting "control" as baseline or reference
# NOTE(review): ref_level is defined elsewhere in the document — presumably
# "control" given the comment above and the coefficient names printed below;
# confirm.
dds$condition<-relevel(dds$condition, ref=ref_level)
dds
## class: DESeqDataSet
## dim: 15036 12
## metadata(1): version
## assays(1): counts
## rownames(15036): ENSG00000000003 ENSG00000000419 ... ENSG00000288380
## ENSG00000288398
## rowData names(0):
## colnames(12): Control-1 Control-2 ... PNA-10b+21-2 PNA-10b+21-3
## colData names(5): sample condition Cells treat.info Incubation.time
# Run the full DESeq2 pipeline: size factors, dispersion estimation, GLM fit
# and Wald testing (the steps echoed below).
dds<-DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
Accessing results
# List the model coefficients available to results()/lfcShrink().
resultsNames(dds)
## [1] "Intercept" "condition_PNA.10b_vs_control"
## [3] "condition_PNA.21_vs_control" "condition_PNA10b.21_vs_control"
## Access and saving normalised counts
normcounts<-as.data.frame(counts(dds,normalized =TRUE))
# Annotate with the gene reference table (`ref`, loaded elsewhere in the
# document); by=0 merges on row names (gene IDs).
normAnnot<-merge(normcounts,ref, by=0, all.x=T)
# Coerce every column to character before export.
# NOTE(review): this discards numeric column types in the CSV — presumably
# deliberate (e.g. to avoid scientific notation); confirm before changing.
normAnnot = data.frame(lapply(normAnnot, as.character), stringsAsFactors=FALSE)
write.csv(normAnnot,file="NormalisedCounts.csv")
Sample to sample Distance Matrix
# Sample-to-sample Euclidean distances on regularized-log transformed counts.
# blind = TRUE: transformations are unaware of the design, as appropriate for
# QC. (The original used the alias T, which is reassignable and unsafe.)
rld <- rlogTransformation(dds, blind = TRUE)
vsd <- varianceStabilizingTransformation(dds, blind = TRUE)
sampleDists <- dist(t(assay(rld)))
sampleDistMatrix <- as.matrix(sampleDists)
# Attach sample metadata to the distance matrix (by=0 joins on row names) for
# annotated heatmap plotting.
sampleDistMatrix_meta <- merge(sampleDistMatrix, sampleTable, by=0, sort=FALSE) %>%
  column_to_rownames(., var="Row.names")
head(sampleDistMatrix_meta)
Principal component Analysis
Principal Component Analysis (PCA) is a widely used statistical method in the analysis of differential gene expression data. It helps in understanding the underlying structure of high-dimensional data, like gene expression profiles, by reducing its complexity. PCA achieves this by transforming the original variables into a new set of uncorrelated variables called principal components, which are ordered so that the first few retain most of the variation present in all of the original variables.
In the context of differential gene expression, PCA is particularly useful for several reasons:
Data Visualization: PCA allows for the visualization of complex gene expression data in two or three dimensions. This can help in identifying patterns, trends, or clusters in the data that might indicate similarities or differences in gene expression under various conditions or among different samples.
Noise Reduction: By focusing on principal components that capture the most variance, PCA can help filter out noise and reduce the dimensionality of the data, making further analysis more tractable.
Identification of Important Genes: The loading scores of the principal components can be used to identify genes that contribute most to the variance in the data. These genes can be candidates for further study as they might play significant roles in the biological processes or conditions being studied.
Batch Effect Correction: PCA can help in identifying and correcting for batch effects, which are technical non-biological differences between batches of samples that can affect gene expression levels.
Comparative Analysis: By comparing the PCA results of different conditions or experiments, researchers can identify shifts in the principal component space, indicating changes in gene expression profiles associated with different biological states or treatments.
# PCA on the 500 most variable genes.
# NOTE(review): the original computed row variances on the rlog matrix (rld)
# but ran prcomp on the VST matrix (vsd) — mixing the two transforms for gene
# selection vs. projection is almost certainly unintended. Both steps now use
# vsd; confirm the intended transform.
# Also renamed `select` -> `top_var_genes`: the original name shadowed
# dplyr/AnnotationDbi::select (see the select() masking messages earlier).
rv <- rowVars(assay(vsd))
top_var_genes <- order(rv, decreasing = TRUE)[seq_len(min(500, length(rv)))]
pc <- prcomp(t(assay(vsd)[top_var_genes, ]))
scores <- data.frame(pc$x)
scores <- cbind(scores, sampleTable)
summary(pc)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 5.7403 2.4383 1.69291 1.35859 1.16329 1.04655 0.95398
## Proportion of Variance 0.6719 0.1212 0.05844 0.03763 0.02759 0.02233 0.01856
## Cumulative Proportion 0.6719 0.7931 0.85152 0.88915 0.91674 0.93907 0.95763
## PC8 PC9 PC10 PC11 PC12
## Standard deviation 0.80214 0.74085 0.68803 0.64212 2.553e-14
## Proportion of Variance 0.01312 0.01119 0.00965 0.00841 0.000e+00
## Cumulative Proportion 0.97075 0.98194 0.99159 1.00000 1.000e+00
# Percent variance explained by PC1..PC6, formatted for axis labels.
# summary(pc) is computed once and reused — the original recomputed it six
# times. Row 2 of $importance is "Proportion of Variance".
pc_importance <- summary(pc)$importance
pc1prcnt <- as.character(as.integer(pc_importance[2, 1] * 100))
pc2prcnt <- as.character(as.integer(pc_importance[2, 2] * 100))
pc3prcnt <- as.character(as.integer(pc_importance[2, 3] * 100))
pc4prcnt <- as.character(as.integer(pc_importance[2, 4] * 100))
pc5prcnt <- as.character(as.integer(pc_importance[2, 5] * 100))
pc6prcnt <- as.character(as.integer(pc_importance[2, 6] * 100))
names <- rownames(scores)

# Helper: scatter plot of two principal components, colored by condition, with
# repelled sample labels and percent-variance axis labels. Replaces three
# copy-pasted plot blocks.
plot_pc_pair <- function(xcol, ycol, xpct, ypct) {
  ggplot(scores, aes(x = .data[[xcol]], y = .data[[ycol]],
                     label = names, col = (factor(condition)))) +
    geom_point(size = 5) +
    geom_text_repel(label = names, col = "black") +
    ggtitle("Principal Components") +
    xlab(paste0(xcol, " (", xpct, "%)")) +
    ylab(paste0(ycol, " (", ypct, "%)"))
}

pc12 <- plot_pc_pair("PC1", "PC2", pc1prcnt, pc2prcnt)
pc34 <- plot_pc_pair("PC3", "PC4", pc3prcnt, pc4prcnt)
pc56 <- plot_pc_pair("PC5", "PC6", pc5prcnt, pc6prcnt)
# All non-intercept coefficients: one DE comparison per entry.
comparison_matrix <- resultsNames(dds)[-1]
source("../../DataAnalysis/Rscripts/volcanoCode_updatedApr2024.R")
# Collectors filled by the comparison loop below.
resout <- list()
shrink_plots_list <- list()
volcano_plot_list <- list()
ma_plot_list <- list()
padj_threshold <- 0.05

# Scatter plot of raw vs. shrunken log2 fold changes, coloring significant
# genes (padj below threshold AND |shrunken LFC| above cutoff).
# analysis_id and lfc_cutoff are now explicit parameters; the defaults
# reproduce the original behavior (which read the AnalysisID global and
# hard-coded 0.58), so existing calls are unaffected.
lf_raw_shrink_plot <- function(raw_res, shrink_res, padj_threshold = 0.05,
                               analysis_id = AnalysisID, lfc_cutoff = 0.58) {
  plotdf <- data.frame("lfraw" = raw_res$log2FoldChange,
                       "lfshrink" = shrink_res$log2FoldChange)
  plotdf["significant"] <- as.factor(
    ifelse(raw_res$padj < padj_threshold &
             abs(shrink_res$log2FoldChange) > lfc_cutoff,
           "Significant", "NotSignificant"))
  ggplot(plotdf, aes(lfraw, lfshrink, col = significant)) +
    geom_point() +
    ggtitle(paste0("Raw.Log2FC vs Shrinked.Log2FC", "_", analysis_id))
}
# For each coefficient: raw and ashr-shrunken results, annotation, diagnostic
# plots, and CSV export.
for (i in seq_along(comparison_matrix)){  # was 1:3; comparison_matrix has 3 entries
  AnalysisID <- paste0(comparison_matrix[i], "_14March2024")
  raw_res <- results(dds, name = comparison_matrix[i])
  shrink_res <- lfcShrink(dds, type="ashr", coef = comparison_matrix[i], quiet=TRUE)
  # Annotate with the gene reference table (`ref`); by=0 joins on row names.
  raw_res_anno <- merge(as.data.frame(raw_res), ref, by=0, all.x=TRUE, sort=FALSE)
  shrink_res_anno <- merge(as.data.frame(shrink_res), ref, by=0, all.x=TRUE, sort=FALSE)
  resout[[paste0(AnalysisID,"_raw")]] <- raw_res_anno
  resout[[paste0(AnalysisID,"_shrink")]] <- shrink_res_anno
  # Reuse the helper defined above; the original duplicated its body inline.
  raw_shrink_plots <- lf_raw_shrink_plot(raw_res, shrink_res, padj_threshold)
  Vol_plot<-volcanoPlot(shrink_res_anno,
                        geneName.col="gene_name",
                        plot.title=paste0("Shrink_Volcano Plot :",AnalysisID),
                        lfc_cut_off=0.58)
  volcano_plot_list[[AnalysisID]] <- Vol_plot
  raw_Vol_plot<-volcanoPlot(raw_res_anno,
                            geneName.col="gene_name",
                            plot.title=paste0("Raw_Volcano Plot :",AnalysisID),
                            lfc_cut_off=0.58)
  #raw_ma <- plotMA(raw_res, ylim=c(-2,2))
  #shrink_ma<- plotMA(shrink_res, ylim=c(-2,2))
  # NOTE(review): ma_plot_list stores the result objects themselves (not MA
  # plots); presumably plotMA() is applied to them later — confirm.
  ma_plot_list[[paste0("RAW_",AnalysisID)]]<- raw_res
  ma_plot_list[[paste0("Shrink_",AnalysisID)]] <- shrink_res
  volcano_plot_list[[paste0("RAW_",AnalysisID)]] <- raw_Vol_plot
  shrink_plots_list[[paste0("Shrink_",AnalysisID)]] <- raw_shrink_plots
  # Character coercion before export (see note on NormalisedCounts.csv above).
  raw_res_anno = data.frame(lapply(raw_res_anno, as.character), stringsAsFactors=FALSE)
  shrink_res_anno = data.frame(lapply(shrink_res_anno, as.character), stringsAsFactors=FALSE)
  write.csv(raw_res_anno, file=paste0(AnalysisID ,"_RawDE.csv"))
  write.csv(shrink_res_anno,file=paste0(AnalysisID ,"_ShrinkDE.csv"))
}
The lfcShrink function in DESeq2 is designed to perform shrinkage estimation of the log2 fold changes (log2FC) in differential expression analysis. DESeq2 is a popular R package used for analyzing count-based data, like RNA-Seq or other forms of genomic data that come in the form of read counts. The main goal of DESeq2 is to find differentially expressed genes between experimental conditions by modeling count data using negative binomial distributions.
The lfcShrink method is particularly useful because it addresses a common issue in differential expression analysis: the estimation of log2 fold changes, especially for genes with low counts or high variance, can be noisy, leading to overestimation of the true effect sizes. Shrinkage estimators pull the estimated log2 fold changes towards zero (or towards a specified prior if different from zero), reducing the influence of noisy estimates on downstream analysis and interpretation. This process tends to improve the stability and reliability of the log2 fold change estimates.
A plot that can be useful for exploring our results is the MA plot. The MA plot shows the mean of the normalized counts versus the log2 fold changes for all genes tested. The genes that are significantly DE are colored to be easily identified. This is also a great way to illustrate the effect of LFC shrinkage: many of the low expressors exhibit very high fold changes, and after shrinkage the fold-change estimates are much smaller. The DESeq2 package offers a simple function to generate an MA plot.
NULL
NULL
NULL
NULL
NULL
NULL
The plots below are scatter plots of the raw versus shrunken fold changes to further illustrate the effect of shrinkage.
Volcano plot is a type of scatter plot that is commonly used in bioinformatics to visually display the results of differential expression analyses, among other applications. It plots significance versus fold-change on the y and x axes, respectively, making it an effective tool for quickly identifying genes that are significantly differentially expressed between two experimental conditions.
In this section we will explore genes that are differentially expressed between each comparison and the overlap.
# NOTE(review): rm(list = ls()) wipes the entire workspace mid-document
# (dds, sampleTable, all plots above). Kept to preserve behavior — downstream
# code re-reads everything from disk — but this is an anti-pattern.
rm(list=ls())
up_gene_list <- list()
down_gene_list <- list()
DE_gene_list <- list()
resultdf <- data.frame()
# Significance thresholds. The original assigned these to `padj`/`log2fc` but
# then filtered with the hard-coded literals 0.05/0.58 (and `padj` collided
# with the padj column inside filter()). Renamed so they are actually used.
padj_cut <- 0.05
lfc_cut <- 0.58
files <- list.files(".", pattern = "DE.csv$")
# NOTE(review): assumes list.files() returns the six CSVs in exactly this
# alphabetical order (Raw/Shrink alternating per comparison) — confirm.
prefx <- c("PNA.10b_RAW","PNA.10b_shrink","PNA.21_Raw","PNA.21_shrink", "PNA10b.21_raw","PNA10b.21_shrink")
for (i in seq_along(files)){  # was 1:length(files)
  fileName <- files[i]
  pref <- prefx[i]
  # Keep only significantly DE genes for this comparison.
  df <- read.csv(fileName) %>%
    column_to_rownames(., var="Row.names") %>%
    dplyr::select(.,log2FoldChange,padj,gene_name) %>%
    filter(., padj < padj_cut & abs(log2FoldChange) > lfc_cut)
  up_gene_list[[pref]] <- rownames(df[df$log2FoldChange > lfc_cut,])
  down_gene_list[[pref]] <- rownames(df[df$log2FoldChange < (-lfc_cut),])
  DE_gene_list[[pref]] <- rownames(df)
  resultdf <- rbind(resultdf, data.frame("Comparison"=pref,"Up"=length(up_gene_list[[pref]]), "Down"=length(down_gene_list[[pref]]),"totalDE"=length(DE_gene_list[[pref]])))
}
suppressPackageStartupMessages(library(ggvenn,quietly = TRUE))
vennplot <- list()
venngenelist <- list("up"=up_gene_list, "down"=down_gene_list, "DE"=DE_gene_list)
# Raw results sit at the odd positions of each gene list, shrunken results at
# the even positions; draw one Venn diagram per (direction, result-kind) pair.
venn_index_sets <- list(raw = c(1, 3, 5), shrink = c(2, 4, 6))
for (ven in names(venngenelist)) {
  for (kind in names(venn_index_sets)) {
    vennplot[[paste0(ven, "_", kind)]] <- ggvenn(
      venngenelist[[ven]][venn_index_sets[[kind]]],
      fill_color = c("steelblue1","lightgreen","violet"),
      stroke_size = 0.5, set_name_size = 6
    )
  }
}
# Auto-print the per-comparison summary of up/down/total DE gene counts.
resultdf
In order to get an estimate of the overlap between the different comparisons, let's put some Venn diagrams together.
# Significant results for the PNA10b+21 vs control comparison, sorted by fold
# change (descending) for ranked downstream analyses.
pna10b_21 <- read.csv("condition_PNA10b.21_vs_control_14March2024_RawDE.csv")%>%
  column_to_rownames(., var="Row.names") %>%
  dplyr::select(., log2FoldChange,padj,gene_name,entrez)%>%
  filter(.,padj < 0.05)%>%
  arrange(desc(log2FoldChange))
colnames(pna10b_21) <- paste0("pna10b.21_", colnames(pna10b_21))
# Ranked vector: log2FC values named by Entrez ID.
pna10b.21_rank <- pna10b_21$pna10b.21_log2FoldChange
names(pna10b.21_rank) <- as.character(pna10b_21$pna10b.21_entrez)
# Drop entries without an Entrez mapping (NA names).
pna10b.21_rank_f <- pna10b.21_rank[!is.na(names(pna10b.21_rank))]
# Use the filtered vector here — the original built pna10b.21_rank_f but then
# subset the unfiltered vector and removed NAs afterwards; the result is the
# same set, but the filtered vector was dead code.
dwn_de <- names(pna10b.21_rank_f[pna10b.21_rank_f < (-0.58)])
dwn_de <- dwn_de[!is.na(dwn_de)]
# WikiPathways over-representation analysis of the down-regulated genes.
wiki <- enrichWP(dwn_de, organism = "Homo sapiens")
dotplot(wiki, showCategory=30)
# GO Molecular Function classification (level 2) of the down-regulated genes,
# sorted by gene count per category.
ggoMF <- groupGO(gene = dwn_de,
OrgDb = org.Hs.eg.db,
ont = "MF",
level = 2,
readable = TRUE) %>%
arrange(desc(Count))
head(ggoMF[,c(1:4)],20)
# Same classification for Biological Process, at the more specific level 6.
ggoBP <- groupGO(gene = dwn_de,
OrgDb = org.Hs.eg.db,
ont = "BP",
level = 6,
readable = TRUE) %>%
arrange(desc(Count))
head(ggoBP[,c(1:4)],20)
# Survival-analysis dependencies: survival (Surv/coxph), ggsurvfit (KM plots),
# gtsummary (summary tables; note it masks AnnotationDbi::select on load, as
# the startup message below shows).
library("survival")
library("ggsurvfit")
library("gtsummary")
##
## Attaching package: 'gtsummary'
## The following object is masked from 'package:AnnotationDbi':
##
## select
# TCGA GBMLGG: miRNA expression-based high/low groups and matched clinical data.
miRexp<-fread("./gdac.broadinstitute.org_GBMLGG.Merge_mirnaseq__illuminahiseq_mirnaseq__bcgsc_ca__Level_3__miR_gene_expression__data.Level_3.2016012800.0.0/miRNA_exp_survival_HH_LL.csv")
clinout<-fread("./gdac.broadinstitute.org_GBMLGG.Clinical_Pick_Tier1.Level_4.2016012800.0.0/GBMLGG.clin.merged.picked.txt")
head(miRexp)
head(clinout)
df<-merge(miRexp,clinout,by.x="Hybridization REF", by.y="Hybridization_REF",all.x=TRUE, sort=FALSE)
head(df)
# Event indicator for the Cox model. The original hard-coded status to 0 for
# every patient, producing zero events and a degenerate fit (the echoed output
# shows "number of events= 0" and NA coefficients). A recorded days_to_death
# indicates a death event.
# TODO(review): confirm against the vital-status field in the clinical file.
df$status <- as.integer(!is.na(df$days_to_death))
df<-df[,c("Hybridization REF","Status","days_to_death","status")]
colnames(df)<-c("patID","exprs","time","status")
# Keep only patients with a survival time.
df<-df[complete.cases(df$time)]
head(df)
# Binary expression coding: 1 = high/high ("HH") group, 0 = otherwise.
df$expCoded<-0
df$expCoded[df$exprs=="HH"]<-1
# Cox proportional-hazards model of survival by expression group.
coxph(Surv(time, status) ~ exprs, data = df)
Call: coxph(formula = Surv(time, status) ~ exprs, data = df)
coef exp(coef) se(coef) z p
exprsLL NA NA 0 NA NA
Likelihood ratio test=0 on 0 df, p=1 n= 73, number of events= 0
# Drop rows with missing patient IDs before plotting.
df<-df[complete.cases(df$patID)]
# Kaplan-Meier curves stratified by expression group (expCoded), with CI,
# risk table, and a horizontal quantile guide at S(t) = 0.6.
# NOTE(review): Surv(time) ignores the censoring indicator — every observation
# is treated as an event. Presumably Surv(time, status) was intended once a
# proper status variable is derived; confirm.
p <- survfit2(Surv(time) ~ expCoded, data = df) |>
ggsurvfit(linewidth = 1) +
add_confidence_interval() +
add_risktable() +
add_quantile(y_value = 0.6, color = "gray50", linewidth = 0.75) +
scale_ggsurvfit()
p